{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-08-22T15:08:45.867425Z",
     "start_time": "2019-08-22T15:08:44.492924Z"
    }
   },
   "outputs": [],
   "source": [
    "import json\n",
    "import pandas as pd\n",
    "import os\n",
    "from tqdm import *\n",
    "import re\n",
    "import numpy as np\n",
    "import gc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-08-22T15:08:45.871389Z",
     "start_time": "2019-08-22T15:08:45.869188Z"
    }
   },
   "outputs": [],
   "source": [
    "# Directory prefix prepended to every CSV file name by the loader functions.\n",
    "# The loaders build paths by plain string concatenation, so keep the trailing slash.\n",
    "path = 'data/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-08-22T15:08:46.359715Z",
     "start_time": "2019-08-22T15:08:46.343704Z"
    }
   },
   "outputs": [],
   "source": [
    "# Data loading (needs to be sped up)\n",
    "def get_age_data():\n",
    "    \"\"\"Load the age-group tables of the train and test users as one frame.\n",
    "\n",
    "    Both CSV files are read without a header row and stacked with the train\n",
    "    rows first. Every missing value in the stacked frame is filled with -1\n",
    "    (presumably the label column that the test file lacks - confirm).\n",
    "\n",
    "    Returns:\n",
    "        DataFrame with the columns ``uId`` and ``age_group``.\n",
    "    \"\"\"\n",
    "    frames = [\n",
    "        pd.read_csv(f'{path}{file_name}', header=None)\n",
    "        for file_name in ('age_train.csv', 'age_test.csv')\n",
    "    ]\n",
    "    combined = pd.concat(frames, axis=0, sort=False).fillna(-1)\n",
    "    combined.columns = ['uId', 'age_group']\n",
    "    return combined\n",
    "\n",
    "def get_user_app_actived():\n",
    "    \"\"\"Read ``user_app_actived.csv`` (no header row) and name its columns.\n",
    "\n",
    "    Returns:\n",
    "        DataFrame with the columns ``uId`` and ``appId``.\n",
    "    \"\"\"\n",
    "    column_names = ['uId', 'appId']\n",
    "    activated = pd.read_csv(f'{path}user_app_actived.csv', header=None)\n",
    "    activated.columns = column_names\n",
    "    return activated\n",
    "\n",
    "def get_user_behavior_info():\n",
    "    \"\"\"Read ``user_behavior_info.csv`` (no header row) and name its columns.\n",
    "\n",
    "    Returns:\n",
    "        DataFrame with the columns ``uId``, ``bootTimes``, ``AFuncTimes`` ...\n",
    "        ``FFuncTimes`` and ``FFuncSum``.\n",
    "    \"\"\"\n",
    "    column_names = [\n",
    "        'uId', 'bootTimes', 'AFuncTimes', 'BFuncTimes', 'CFuncTimes',\n",
    "        'DFuncTimes', 'EFuncTimes', 'FFuncTimes', 'FFuncSum',\n",
    "    ]\n",
    "    behavior = pd.read_csv(f'{path}user_behavior_info.csv', header=None)\n",
    "    behavior.columns = column_names\n",
    "    return behavior\n",
    "\n",
    "def get_user_basic_info():\n",
    "    \"\"\"Read ``user_basic_info.csv`` (no header row) and name its columns.\n",
    "\n",
    "    Returns:\n",
    "        DataFrame with the columns ``uId``, ``gender``, ``city``, ``prodName``,\n",
    "        ``ramCapacity``, ``ramLeftRation``, ``romCapacity``, ``romLeftRation``,\n",
    "        ``color``, ``fontSize``, ``ct``, ``carrier`` and ``os``.\n",
    "    \"\"\"\n",
    "    column_names = [\n",
    "        'uId', 'gender', 'city', 'prodName', 'ramCapacity',\n",
    "        'ramLeftRation', 'romCapacity', 'romLeftRation', 'color',\n",
    "        'fontSize', 'ct', 'carrier', 'os',\n",
    "    ]\n",
    "    basic_info = pd.read_csv(f'{path}user_basic_info.csv', header=None)\n",
    "    basic_info.columns = column_names\n",
    "    return basic_info\n",
    "\n",
    "def get_app_info():\n",
    "    \"\"\"Read ``app_info.csv`` (no header row) and name its columns.\n",
    "\n",
    "    Returns:\n",
    "        DataFrame with the columns ``appId`` and ``category``.\n",
    "    \"\"\"\n",
    "    column_names = ['appId', 'category']\n",
    "    app_info = pd.read_csv(f'{path}app_info.csv', header=None)\n",
    "    app_info.columns = column_names\n",
    "    return app_info\n",
    "\n",
    "# Use less_data=True while testing\n",
    "# Use less_data=False when extracting features\n",
    "def get_user_app_usage(less_data=False):\n",
    "    \"\"\"Read ``user_app_usage.csv`` (no header row) and name its columns.\n",
    "\n",
    "    Args:\n",
    "        less_data: when True only the first 2,000,000 rows are read, which\n",
    "            keeps experiments fast; when False the whole file is read.\n",
    "\n",
    "    Returns:\n",
    "        DataFrame with the columns ``uId``, ``appId``, ``duration``, ``times``\n",
    "        and ``use_date``.\n",
    "    \"\"\"\n",
    "    # Both modes must pass header=None: the file has no header row, so letting\n",
    "    # pandas infer one would silently swallow the first data row.\n",
    "    # nrows=None means \"read everything\".\n",
    "    row_limit = 2000000 if less_data else None\n",
    "    data = pd.read_csv(path + 'user_app_usage.csv', header=None, nrows=row_limit)\n",
    "    data.columns = ['uId', 'appId', 'duration', 'times', 'use_date']\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 5000000/5000000 [00:12<00:00, 398634.17it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "开始计算tf-idf特征\n",
      "计算结束\n",
      "开始进行一些前期处理\n",
      "处理完毕\n",
      "\n",
      "****开始跑 LogisticRegression ****\n",
      "1/5 0.55787625\n",
      "2/5 0.55755\n",
      "3/5 0.5568125\n",
      "4/5 0.5575725\n",
      "5/5 0.55856\n",
      "LogisticRegression 处理完毕\n",
      "五折结果 [0.55788, 0.55755, 0.55681, 0.55757, 0.55856]\n",
      "平均结果 0.557674\n",
      "\n",
      "****开始跑 SGDClassifier ****\n",
      "1/5 0.5381175\n",
      "2/5 0.53828625\n",
      "3/5 0.53728625\n",
      "4/5 0.537975\n",
      "5/5 0.539605\n",
      "SGDClassifier 处理完毕\n",
      "五折结果 [0.53812, 0.53829, 0.53729, 0.53798, 0.5396]\n",
      "平均结果 0.5382560000000001\n",
      "\n",
      "****开始跑 PassiveAggressiveClassifier ****\n",
      "1/5 0.44627375\n",
      "2/5 0.445645\n",
      "3/5 0.44823625\n",
      "4/5 0.44563\n",
      "5/5 0.44951\n",
      "PassiveAggressiveClassifier 处理完毕\n",
      "五折结果 [0.44627, 0.44564, 0.44824, 0.44563, 0.44951]\n",
      "平均结果 0.447058\n",
      "\n",
      "****开始跑 RidgeClassfiy ****\n",
      "1/5 0.542295\n",
      "2/5 0.5423875\n",
      "3/5 0.54163875\n",
      "4/5 0.54186375\n"
     ]
    }
   ],
   "source": [
    "# f1\n",
    "id_label_data = get_age_data()\n",
    "# The label must be passed as `desc=`: tqdm.pandas() hands its arguments to the\n",
    "# tqdm constructor, where a positional value is not the description.\n",
    "tqdm.pandas(desc='获取特征')\n",
    "# Build TF-IDF features from the app-activation table\n",
    "data = get_user_app_actived()\n",
    "data = pd.merge(id_label_data, data, on='uId', how='left')\n",
    "# Users without a matching activation row get the placeholder token '无'\n",
    "data = data.fillna('无')\n",
    "# Replace '#' with spaces so the vectorizer sees whitespace-separated tokens\n",
    "# (presumably '#' delimits the app ids of one user - confirm)\n",
    "data['appId'] = data['appId'].progress_apply(lambda row: str(row).replace('#', ' '))\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "from sklearn.metrics import accuracy_score\n",
    "import numpy as np\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.linear_model import SGDClassifier\n",
    "from sklearn.linear_model import PassiveAggressiveClassifier\n",
    "from sklearn.linear_model import RidgeClassifier\n",
    "from sklearn.naive_bayes import BernoulliNB\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.svm import LinearSVC\n",
    "\n",
    "from sklearn.metrics import roc_auc_score\n",
    "# Rows carrying the -1 placeholder label form the test set, all others the training set\n",
    "is_test = data['age_group'] == -1\n",
    "df_train = data[~is_test]\n",
    "df_test = data[is_test]\n",
    "n_train = len(df_train)\n",
    "\n",
    "############################ Load data ############################\n",
    "# Training rows first, so the first n_train rows of the matrix are the labelled ones\n",
    "data = pd.concat([df_train, df_test], axis=0, sort=False)\n",
    "data['appId'] = data['appId'].map(str)\n",
    "\n",
    "############################ tf-idf ############################\n",
    "print('开始计算tf-idf特征')\n",
    "tf = TfidfVectorizer(ngram_range=(1, 1))\n",
    "discuss_tf = tf.fit_transform(data['appId']).tocsr()\n",
    "print('计算结束')\n",
    "\n",
    "############################ Split the data set ##########################\n",
    "print('开始进行一些前期处理')\n",
    "train_feature, test_feature = discuss_tf[:n_train], discuss_tf[n_train:]\n",
    "# Labels shifted down by one (presumably from 1-based groups to 0-based classes)\n",
    "score = df_train['age_group'] - 1\n",
    "print('处理完毕')\n",
    "\n",
    "######################### Model function (returns the sklearn stacking result) ########################\n",
    "def get_sklearn_classfiy_stacking(clf, train_feature, test_feature, score, model_name, class_number, n_folds, train_num, test_num):\n",
    "    \"\"\"Build out-of-fold stacking features from one classifier.\n",
    "\n",
    "    ``clf`` is refit on every fold. Its probabilities for the held-out rows\n",
    "    fill the training part of the result, and its probabilities for\n",
    "    ``test_feature`` are averaged over the folds.\n",
    "\n",
    "    Args:\n",
    "        clf: classifier providing ``fit`` and ``_predict_proba_lr``.\n",
    "        train_feature: row-indexable feature matrix of the labelled rows.\n",
    "        test_feature: feature matrix of the unlabelled rows.\n",
    "        score: class labels aligned row by row with ``train_feature``.\n",
    "        model_name: label used in log messages and output column names.\n",
    "        class_number: number of probability columns produced by ``clf``.\n",
    "        n_folds: number of stratified folds.\n",
    "        train_num: number of rows in ``train_feature``.\n",
    "        test_num: number of rows in ``test_feature``.\n",
    "\n",
    "    Returns:\n",
    "        Tuple ``(df_stack, score_mean)``: a DataFrame with ``train_num + test_num``\n",
    "        rows (training rows first) and one column per class, and the list of\n",
    "        per-fold validation accuracies rounded to 5 decimals.\n",
    "    \"\"\"\n",
    "    print('\\n****开始跑', model_name, '****')\n",
    "    # Fold indices are positions, so index the labels positionally; a pandas\n",
    "    # Series would be looked up by index label instead.\n",
    "    labels = np.asarray(score)\n",
    "    stack_train = np.zeros((train_num, class_number))\n",
    "    stack_test = np.zeros((test_num, class_number))\n",
    "    score_mean = []\n",
    "    # No random_state here: without shuffle=True it has no effect, and newer\n",
    "    # scikit-learn releases raise a ValueError for that combination.\n",
    "    skf = StratifiedKFold(n_splits=n_folds)\n",
    "    for i, (tr, va) in enumerate(skf.split(train_feature, labels)):\n",
    "        clf.fit(train_feature[tr], labels[tr])\n",
    "        # NOTE(review): _predict_proba_lr is a private scikit-learn method; it is kept\n",
    "        # because callers also pass classifiers without a public predict_proba.\n",
    "        score_va = clf._predict_proba_lr(train_feature[va])\n",
    "        score_te = clf._predict_proba_lr(test_feature)\n",
    "        # Reuse the validation probabilities instead of predicting a second time\n",
    "        score_single = accuracy_score(labels[va], np.argmax(score_va, axis=1))\n",
    "        score_mean.append(np.around(score_single, 5))\n",
    "        print(f'{i+1}/{n_folds}', score_single)\n",
    "        stack_train[va] += score_va\n",
    "        stack_test += score_te\n",
    "    # Average the test predictions of all folds\n",
    "    stack_test /= n_folds\n",
    "    stack = np.vstack([stack_train, stack_test])\n",
    "    columns = ['tfidf_ori_1_1_' + model_name + '_classfiy_{}'.format(k) for k in range(stack.shape[1])]\n",
    "    df_stack = pd.DataFrame(stack, columns=columns)\n",
    "    print(model_name, '处理完毕')\n",
    "    return df_stack, score_mean\n",
    "\n",
    "# NOTE(review): loss='log' was renamed to 'log_loss' in newer scikit-learn releases;\n",
    "# confirm against the installed version.\n",
    "model_list = [\n",
    "    ['LogisticRegression', LogisticRegression(random_state=1017, C=3)],\n",
    "    ['SGDClassifier', SGDClassifier(random_state=1017, loss='log')],\n",
    "    ['PassiveAggressiveClassifier', PassiveAggressiveClassifier(random_state=1017, C=2)],\n",
    "    ['RidgeClassfiy', RidgeClassifier(random_state=1017)],\n",
    "    ['LinearSVC', LinearSVC(random_state=1017)]\n",
    "]\n",
    "\n",
    "# Collect the per-model frames and join them once instead of re-concatenating in the loop\n",
    "stack_frames = []\n",
    "for model_name, clf in model_list:\n",
    "    # 6 probability columns, 5 folds\n",
    "    stack_result, score_mean = get_sklearn_classfiy_stacking(clf, train_feature, test_feature, score, model_name, 6, 5, len(df_train), len(df_test))\n",
    "    stack_frames.append(stack_result)\n",
    "    print('五折结果', score_mean)\n",
    "    print('平均结果', np.mean(score_mean))\n",
    "feature = pd.concat(stack_frames, axis=1, sort=False)\n",
    "# Create the output directory first so the long run cannot fail at the final write\n",
    "os.makedirs('lgb_feature', exist_ok=True)\n",
    "feature.to_csv('lgb_feature/f1.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# f2\n",
    "id_label_data = get_age_data()\n",
    "# The label must be passed as `desc=`: tqdm.pandas() hands its arguments to the\n",
    "# tqdm constructor, where a positional value is not the description.\n",
    "tqdm.pandas(desc='获取特征')\n",
    "# Build TF-IDF features from the app-activation table\n",
    "data = get_user_app_actived()\n",
    "data = pd.merge(id_label_data, data, on='uId', how='left')\n",
    "# Users without a matching activation row get the placeholder token '无'\n",
    "data = data.fillna('无')\n",
    "# Replace '#' with spaces so the vectorizer sees whitespace-separated tokens\n",
    "# (presumably '#' delimits the app ids of one user - confirm)\n",
    "data['appId'] = data['appId'].progress_apply(lambda row: str(row).replace('#', ' '))\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "from sklearn.metrics import accuracy_score\n",
    "import numpy as np\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.linear_model import SGDClassifier\n",
    "from sklearn.linear_model import PassiveAggressiveClassifier\n",
    "from sklearn.linear_model import RidgeClassifier\n",
    "from sklearn.naive_bayes import BernoulliNB\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.svm import LinearSVC\n",
    "\n",
    "from sklearn.metrics import roc_auc_score\n",
    "# Rows carrying the -1 placeholder label form the test set, all others the training set\n",
    "df_train = data[data['age_group'] != -1]\n",
    "df_test = data[data['age_group']==-1]\n",
    "\n",
    "############################ Load data ############################\n",
    "# Training rows first, so the first len(df_train) rows of the matrix are the labelled ones\n",
    "data = pd.concat([df_train, df_test], axis=0, sort=False)\n",
    "data['appId'] = data['appId'].apply(lambda row:str(row))\n",
    "\n",
    "############################ tf-idf ############################\n",
    "print('开始计算tf-idf特征')\n",
    "# The three switches are real booleans: scikit-learn's parameter validation\n",
    "# rejects the integer 1 in newer releases.\n",
    "tf = TfidfVectorizer(ngram_range=(1, 1), min_df=3, max_df=0.9, use_idf=True, smooth_idf=True, sublinear_tf=True)\n",
    "discuss_tf = tf.fit_transform(data['appId']).tocsr()\n",
    "print('计算结束')\n",
    "\n",
    "############################ Split the data set ##########################\n",
    "print('开始进行一些前期处理')\n",
    "train_feature = discuss_tf[:len(df_train)]\n",
    "# Labels shifted down by one (presumably from 1-based groups to 0-based classes)\n",
    "score = df_train['age_group'] - 1\n",
    "test_feature = discuss_tf[len(df_train):]\n",
    "print('处理完毕')\n",
    "\n",
    "######################### Model function (returns the sklearn stacking result) ########################\n",
    "def get_sklearn_classfiy_stacking(clf, train_feature, test_feature, score, model_name, class_number, n_folds, train_num, test_num):\n",
    "    \"\"\"Build out-of-fold stacking features from one classifier.\n",
    "\n",
    "    ``clf`` is refit on every fold. Its probabilities for the held-out rows\n",
    "    fill the training part of the result, and its probabilities for\n",
    "    ``test_feature`` are averaged over the folds.\n",
    "\n",
    "    Args:\n",
    "        clf: classifier providing ``fit`` and ``_predict_proba_lr``.\n",
    "        train_feature: row-indexable feature matrix of the labelled rows.\n",
    "        test_feature: feature matrix of the unlabelled rows.\n",
    "        score: class labels aligned row by row with ``train_feature``.\n",
    "        model_name: label used in log messages and output column names.\n",
    "        class_number: number of probability columns produced by ``clf``.\n",
    "        n_folds: number of stratified folds.\n",
    "        train_num: number of rows in ``train_feature``.\n",
    "        test_num: number of rows in ``test_feature``.\n",
    "\n",
    "    Returns:\n",
    "        Tuple ``(df_stack, score_mean)``: a DataFrame with ``train_num + test_num``\n",
    "        rows (training rows first) and one column per class, and the list of\n",
    "        per-fold validation accuracies rounded to 5 decimals.\n",
    "    \"\"\"\n",
    "    print('\\n****开始跑', model_name, '****')\n",
    "    # Fold indices are positions, so index the labels positionally; a pandas\n",
    "    # Series would be looked up by index label instead.\n",
    "    labels = np.asarray(score)\n",
    "    stack_train = np.zeros((train_num, class_number))\n",
    "    stack_test = np.zeros((test_num, class_number))\n",
    "    score_mean = []\n",
    "    # No random_state here: without shuffle=True it has no effect, and newer\n",
    "    # scikit-learn releases raise a ValueError for that combination.\n",
    "    skf = StratifiedKFold(n_splits=n_folds)\n",
    "    for i, (tr, va) in enumerate(skf.split(train_feature, labels)):\n",
    "        clf.fit(train_feature[tr], labels[tr])\n",
    "        # NOTE(review): _predict_proba_lr is a private scikit-learn method; it is kept\n",
    "        # because callers also pass classifiers without a public predict_proba.\n",
    "        score_va = clf._predict_proba_lr(train_feature[va])\n",
    "        score_te = clf._predict_proba_lr(test_feature)\n",
    "        # Reuse the validation probabilities instead of predicting a second time\n",
    "        score_single = accuracy_score(labels[va], np.argmax(score_va, axis=1))\n",
    "        score_mean.append(np.around(score_single, 5))\n",
    "        print(f'{i+1}/{n_folds}', score_single)\n",
    "        stack_train[va] += score_va\n",
    "        stack_test += score_te\n",
    "    # Average the test predictions of all folds\n",
    "    stack_test /= n_folds\n",
    "    stack = np.vstack([stack_train, stack_test])\n",
    "    columns = ['tfidf_1_1_' + model_name + '_classfiy_{}'.format(k) for k in range(stack.shape[1])]\n",
    "    df_stack = pd.DataFrame(stack, columns=columns)\n",
    "    print(model_name, '处理完毕')\n",
    "    return df_stack, score_mean\n",
    "\n",
    "# NOTE(review): loss='log' was renamed to 'log_loss' in newer scikit-learn releases;\n",
    "# confirm against the installed version.\n",
    "model_list = [\n",
    "    ['LogisticRegression', LogisticRegression(random_state=1017, C=3)],\n",
    "    ['SGDClassifier', SGDClassifier(random_state=1017, loss='log')],\n",
    "    ['PassiveAggressiveClassifier', PassiveAggressiveClassifier(random_state=1017, C=2)],\n",
    "    ['RidgeClassfiy', RidgeClassifier(random_state=1017)],\n",
    "    ['LinearSVC', LinearSVC(random_state=1017)]\n",
    "]\n",
    "\n",
    "# Collect the per-model frames and join them once instead of re-concatenating in the loop\n",
    "stack_frames = []\n",
    "for model_name, clf in model_list:\n",
    "    # 6 probability columns, 5 folds\n",
    "    stack_result, score_mean = get_sklearn_classfiy_stacking(clf, train_feature, test_feature, score, model_name, 6, 5, len(df_train), len(df_test))\n",
    "    stack_frames.append(stack_result)\n",
    "    print('五折结果', score_mean)\n",
    "    print('平均结果', np.mean(score_mean))\n",
    "feature = pd.concat(stack_frames, axis=1, sort=False)\n",
    "# Create the output directory first so the long run cannot fail at the final write\n",
    "os.makedirs('lgb_feature', exist_ok=True)\n",
    "feature.to_csv('lgb_feature/f2.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# f3\n",
    "id_label_data = get_age_data()\n",
    "# The label must be passed as `desc=`: tqdm.pandas() hands its arguments to the\n",
    "# tqdm constructor, where a positional value is not the description.\n",
    "tqdm.pandas(desc='获取特征')\n",
    "# Behaviour features\n",
    "data = get_user_behavior_info()\n",
    "# Left merge keeps one row per entry of the id/label table, in its order\n",
    "data = pd.merge(id_label_data, data, on='uId', how='left')\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "# Keep every merged column except the id and the label\n",
    "feature = data.drop(columns=['uId', 'age_group'])\n",
    "# Create the output directory first so the write cannot fail on a fresh checkout\n",
    "os.makedirs('lgb_feature', exist_ok=True)\n",
    "feature.to_csv('lgb_feature/f3.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# f4\n",
    "id_label_data = get_age_data()\n",
    "# The label must be passed as `desc=`: tqdm.pandas() hands its arguments to the\n",
    "# tqdm constructor, where a positional value is not the description.\n",
    "tqdm.pandas(desc='获取特征')\n",
    "# Basic user attributes\n",
    "data = get_user_basic_info()\n",
    "data = pd.merge(id_label_data, data, on='uId', how='left')\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "# .copy() gives an independent frame, so the column assignments below do not\n",
    "# write into a slice of `data` (SettingWithCopy)\n",
    "feature = data[['gender', 'ramCapacity', 'ramLeftRation', 'romCapacity', 'romLeftRation', 'fontSize', 'os']].copy()\n",
    "# Keep the integer after the last 'c' / 'p'; missing values become -1\n",
    "feature['city'] = data['city'].fillna(-1).progress_apply(lambda row:int(str(row).split('c')[-1]))\n",
    "feature['prodName'] = data['prodName'].fillna(-1).progress_apply(lambda row:int(str(row).split('p')[-1]))\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "feature['color'] = LabelEncoder().fit_transform(data['color'])\n",
    "# Number of characters in the colour name\n",
    "feature['color_length'] = data['color'].progress_apply(lambda row:len(row))\n",
    "def get_color(row):\n",
    "    \"\"\"Reduce a colour name to a short form ending in '色'.\n",
    "\n",
    "    - A name not ending in '色' becomes its last character plus '色'.\n",
    "    - A three-character name ending in '色' loses its first character.\n",
    "    - Any other name ending in '色' is returned unchanged.\n",
    "    \"\"\"\n",
    "    last_char = row[-1]\n",
    "    if last_char != '色':\n",
    "        return last_char + '色'\n",
    "    return row[1:] if len(row) == 3 else row\n",
    "data['color_deal'] = data['color'].progress_apply(lambda row:get_color(row))\n",
    "# Map selected colour tokens onto another label in a single pass; no target\n",
    "# is itself a key, so this equals replacing them one after another\n",
    "color_aliases = {'母色': '光色', '境色': '光色', '版色': '光色', '槟色': '橘色', '翠色': '绿色', '蝶色': '粉色'}\n",
    "data['color_deal'] = data['color_deal'].replace(color_aliases)\n",
    "feature['color_last'] = LabelEncoder().fit_transform(data['color_deal'])\n",
    "feature['ct'] = LabelEncoder().fit_transform(data['ct'].fillna('无'))\n",
    "feature['carrier'] = LabelEncoder().fit_transform(data['carrier'])\n",
    "feature['os'] = data['os']\n",
    "# Text before the first '.' and after the last '.' of the os value; missing values become -1\n",
    "feature['os_1'] = data['os'].fillna(-1).progress_apply(lambda row:int(str(row).split('.')[0]))\n",
    "feature['os_2'] = data['os'].fillna(-1).progress_apply(lambda row:int(str(row).split('.')[-1]))\n",
    "for col in ['city', 'prodName', 'color', 'color_last']:\n",
    "    # Rank of each value's frequency, divided by the number of distinct values\n",
    "    value_rank = feature[col].value_counts().rank() / len(feature[col].unique())\n",
    "    feature[col + 'value_rank'] = feature[col].map(value_rank)\n",
    "# Create the output directory first so the write cannot fail on a fresh checkout\n",
    "os.makedirs('lgb_feature', exist_ok=True)\n",
    "feature.to_csv('lgb_feature/f4.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# f5\n",
    "import shutil\n",
    "import os\n",
    "# NOTE(review): the source is read from 'feature/', which no visible cell writes to -\n",
    "# presumably produced by another notebook; confirm the path.\n",
    "file = 'feature/f4.csv'\n",
    "file_dir = 'lgb_feature/f5.csv'\n",
    "# Create the destination directory first so the copy cannot fail on a fresh checkout\n",
    "os.makedirs(os.path.dirname(file_dir), exist_ok=True)\n",
    "shutil.copy(file,file_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-08-11T08:40:45.344935Z",
     "start_time": "2019-08-11T08:40:44.112571Z"
    }
   },
   "outputs": [],
   "source": [
    "# f6\n",
    "import shutil\n",
    "import os\n",
    "# NOTE(review): the source is read from 'feature/', which no visible cell writes to -\n",
    "# presumably produced by another notebook; confirm the path.\n",
    "file = 'feature/f5.csv'\n",
    "file_dir = 'lgb_feature/f6.csv'\n",
    "# Create the destination directory first so the copy cannot fail on a fresh checkout\n",
    "os.makedirs(os.path.dirname(file_dir), exist_ok=True)\n",
    "shutil.copy(file,file_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "start_time": "2019-08-22T15:09:58.196Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "122"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "122"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
